library (data.table)
library(ggbiplot)
# Load the raw odds history and the match results.
dt_odds <- readRDS("~/Documents/Dersler/Fall-2018/IE-582/df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.rds")
dt_matches <- readRDS("~/Documents/Dersler/Fall-2018/IE-582/df9b1196-e3cf-4cc7-9159-f236fe738215_matches.rds")

# Keep only the five bookmakers used in the analysis.
dt_odds <- dt_odds[bookmaker %in% c("10Bet", "bet365", "Betsafe", "bwin", "bet-at-home")]

# Keep the final odd per match / bet type / odd type / bookmaker:
# rank(-date) gives rank 1 to the largest date within each group.
# NOTE(review): with ties.method = "max", a group whose largest date occurs
# more than once has no row of rank 1 and drops out entirely -- confirm that
# this is intended.
dt_odds[, rnk := rank(-date, ties.method = "max"),
        by = list(matchId, betType, oddtype, bookmaker)]
dt_oddsf <- dt_odds[rnk == 1]

# Restrict to the four bet types of interest and the columns needed below.
dt_oddss <- dt_oddsf[betType %in% c("1x2", "bts", "ou", "ha"),
                     .(matchId, bookmaker, betType, oddtype, odd)]

# Reshape to one row per match and bookmaker with one column per
# betType_oddtype combination. value.var is named explicitly so dcast does not
# have to guess which column holds the values.
dt_oddst <- dcast(dt_oddss, matchId + bookmaker ~ betType + oddtype,
                  value.var = "odd")

# Give the 1x2 columns readable names.
setnames(dt_oddst, c("1x2_odd1", "1x2_odd2", "1x2_oddX"), c("Home", "Away", "Tie"))

# Drop rows with any missing odd (enough instances remain).
dt_oddst <- na.omit(dt_oddst)
# Split the "home:away" score string into two integer columns.
dt_matches[, c("score1", "score2") := tstrsplit(score, ":", fixed = TRUE)]
dt_matches[, score1 := as.integer(score1)]
dt_matches[, score2 := as.integer(score2)]

# Drop every row that has a missing value in any column.
dt_matches <- na.omit(dt_matches)

# Derived labels: total goals, over/under 2.5 goals, and the match outcome.
dt_matches[, totalgoal := score1 + score2]
dt_matches[, Overstatus := c("Under", "Over")[(totalgoal > 2.5) + 1L]]
# sign() is -1 / 0 / 1 for an away win / tie / home win; shift it to index 1..3.
dt_matches[, Outcomes := c("Away", "Tie", "Home")[sign(score1 - score2) + 2L]]
# Key both tables on matchId for the join.
setkey(dt_oddst, matchId)
setkey(dt_matches, matchId)

# The nine numeric odds features used for PCA.
odds_cols <- c("Home", "Away", "Tie", "bts_NO", "bts_YES",
               "ou_over", "ou_under", "ha_1", "ha_2")

# Left-join the match results onto the odds once; both labelled tables are
# derived from this single join.
dt_joined <- merge(dt_oddst, unique(dt_matches), by = "matchId", all.x = TRUE)

# Odds plus the over/under label; rows with any missing value are dropped.
dt_merged <- na.omit(
  dt_joined[, .(matchId, Home, Away, Tie, bts_NO, bts_YES, ou_over, ou_under,
                Overstatus, bookmaker, ha_1, ha_2)]
)

# Odds plus the match outcome label; rows with any missing value are dropped.
dt_merged2 <- na.omit(
  dt_joined[, .(matchId, Home, Away, Tie, bts_NO, bts_YES, ou_over, ou_under,
                Outcomes, bookmaker, ha_1, ha_2)]
)

# Plot only the first 250 instances. head() returns fewer rows when fewer
# exist, whereas dt[1:250] would pad with all-NA rows and make prcomp() fail.
dt_merged <- head(dt_merged, 250L)[, c(odds_cols, "Overstatus", "bookmaker"),
                                   with = FALSE]

# PCA on the centred and scaled odds features.
dt_merged.pca <- prcomp(dt_merged[, odds_cols, with = FALSE],
                        center = TRUE, scale. = TRUE)
summary(dt_merged.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.9544 1.3813 1.1062 0.9924 0.9866 0.21786 0.15656
## Proportion of Variance 0.4244 0.2120 0.1360 0.1094 0.1082 0.00527 0.00272
## Cumulative Proportion 0.4244 0.6364 0.7723 0.8818 0.9899 0.99522 0.99795
## PC8 PC9
## Standard deviation 0.10562 0.08553
## Proportion of Variance 0.00124 0.00081
## Cumulative Proportion 0.99919 1.00000
PC1 explains about 42% of the total variance; together with PC2 it explains about 64%. These two components can therefore provide insight into the data.
# Biplot of the first two principal components, grouped by the over/under
# label, with one ellipse per group.
ggbiplot(
  dt_merged.pca,
  choices = 1:2,
  scale = 0,
  obs.scale = 1,
  var.scale = 1,
  groups = dt_merged$Overstatus,
  ellipse = TRUE
)
The Away, ha_2, ou_under and Tie features are located close to each other, which indicates that these variables are correlated. The Home, ha_1 and ou_over variables are also located close together, so they are correlated with each other as well. These features could be combined for feature reduction.
The Over and Under results appear correlated; they are generally plotted in the same regions.
# Multidimensional scaling of the odds under two distance metrics.
# Only the nine numeric odds columns go into dist(); the character columns
# (Overstatus, bookmaker) are not numeric features and would otherwise be
# coerced to NA with a warning.
mds_features <- dt_merged[, c("Home", "Away", "Tie", "bts_NO", "bts_YES",
                              "ou_over", "ou_under", "ha_1", "ha_2"),
                          with = FALSE]

# Colour each point by its over/under label rather than by row position.
mds_groups <- factor(dt_merged$Overstatus)
mds_palette <- c("red", "green")
mds_colours <- mds_palette[mds_groups]

# Euclidean distance.
d_mcd <- dist(mds_features, method = "euclidean")
MDS <- cmdscale(d_mcd)
plot(MDS[, 1], MDS[, 2], pch = 21, bg = mds_colours)
legend("topright", legend = levels(mds_groups), pch = 21,
       pt.bg = mds_palette[seq_along(levels(mds_groups))])

# Manhattan distance.
d_mcd <- dist(mds_features, method = "manhattan")
MDS <- cmdscale(d_mcd)
plot(MDS[, 1], MDS[, 2], pch = 21, bg = mds_colours)
legend("topright", legend = levels(mds_groups), pch = 21,
       pt.bg = mds_palette[seq_along(levels(mds_groups))])
The variables have smaller variance under the Manhattan distance than under the Euclidean distance. The MDS plots show that Over and Under are highly correlated, consistent with the PCA results.
# Same analysis with the match outcome (Home / Away / Tie) as the label.
outcome_odds_cols <- c("Home", "Away", "Tie", "bts_NO", "bts_YES",
                       "ou_over", "ou_under", "ha_1", "ha_2")

# Keep the first 250 instances. head() returns fewer rows when fewer exist,
# whereas dt[1:250] would pad with all-NA rows and make prcomp() fail.
dt_merged2 <- head(dt_merged2, 250L)[
  , c(outcome_odds_cols, "Outcomes", "bookmaker"), with = FALSE
]

# PCA on the centred and scaled odds features.
dt_merged2.pca <- prcomp(dt_merged2[, outcome_odds_cols, with = FALSE],
                         center = TRUE, scale. = TRUE)
print(dt_merged2.pca)
## Standard deviations (1, .., p=9):
## [1] 1.95435060 1.38128540 1.10623624 0.99243457 0.98662598 0.21786416
## [7] 0.15656391 0.10562251 0.08553013
##
## Rotation (n x k) = (9 x 9):
## PC1 PC2 PC3 PC4 PC5
## Home 0.298677020 -0.549597956 0.2501227 -0.04213578 -0.02086839
## Away -0.491049218 0.022279585 0.2178073 -0.09586584 -0.04514108
## Tie -0.392787000 -0.192246939 0.4794345 -0.19899760 -0.08289685
## bts_NO 0.300073950 0.414635221 0.4655681 -0.20154250 0.01536690
## bts_YES -0.331035896 -0.410437232 -0.4130381 0.16102173 0.04649162
## ou_over 0.004984756 -0.069422513 -0.2524744 -0.80110990 0.53683934
## ou_under -0.055482200 0.009252048 0.2603544 0.47815914 0.83571302
## ha_1 0.275068811 -0.561315686 0.2858488 -0.05597631 -0.02880088
## ha_2 -0.486812040 0.011195932 0.2440285 -0.09664530 -0.02854943
## PC6 PC7 PC8 PC9
## Home 0.03964179 -0.01513666 -0.21726939 0.703530583
## Away 0.12136258 -0.28501821 0.69690309 0.343610420
## Tie -0.17229441 0.68731715 -0.06180683 -0.162849119
## bts_NO 0.68732606 0.06606402 -0.05024250 -0.033440085
## bts_YES 0.69124996 0.18438629 -0.07172082 -0.079359162
## ou_over -0.03406611 -0.01438613 0.00676774 0.005055848
## ou_under -0.03713158 0.02018891 0.01453152 -0.001338908
## ha_1 0.03120514 -0.32673762 0.26390753 -0.588397025
## ha_2 0.01605154 -0.54808381 -0.62106978 -0.082475657
# Importance of each principal component of the outcome-labelled data
# (standard deviation, proportion of variance, cumulative proportion).
summary(dt_merged2.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.9544 1.3813 1.1062 0.9924 0.9866 0.21786 0.15656
## Proportion of Variance 0.4244 0.2120 0.1360 0.1094 0.1082 0.00527 0.00272
## Cumulative Proportion 0.4244 0.6364 0.7723 0.8818 0.9899 0.99522 0.99795
## PC8 PC9
## Standard deviation 0.10562 0.08553
## Proportion of Variance 0.00124 0.00081
## Cumulative Proportion 0.99919 1.00000
PC1 explains about 42% of the total variance; together with PC2 it explains about 64%.
# Biplot of the first two principal components, grouped by match outcome,
# with one ellipse per group.
ggbiplot(
  dt_merged2.pca,
  choices = 1:2,
  scale = 0,
  obs.scale = 1,
  var.scale = 1,
  groups = dt_merged2$Outcomes,
  ellipse = TRUE
)
The match outcomes are correlated with each other; they are located in similar regions.
library(jpeg)
# Read the colour image and draw it at its native pixel size. The plot extent
# is taken from the array itself instead of a hard-coded 512, so an image of
# any size is shown without distortion.
image <- readJPEG("~/Documents/Dersler/Fall-2018/IE-582/image.jpg")
plot(c(0, ncol(image)), c(0, nrow(image)), type = "n", xlab = "", ylab = "")
rasterImage(image, 0, 0, ncol(image), nrow(image))
The data are stored as numeric values, and the dimensions are [1:512, 1:512, 1:3].
# Split the image array into its three channels.
r <- image[, , 1]
g <- image[, , 2]
b <- image[, , 3]

# Show the three channels side by side. Each channel is transposed and
# reversed along both axes before being handed to image().
old_par <- par(mfrow = c(1, 3))
for (channel in list(r, g, b)) {
  image(t(channel)[ncol(channel):1, nrow(channel):1])
}
# Restore the previous layout so later plots are not squeezed into a 1 x 3 grid.
par(old_par)
# Add uniform noise from [0, 0.1] to every pixel of every channel.
# The draws are generated in one call and arranged so that element [i, j, z]
# receives the same random number it would get from nested loops over i, then
# j, then z (z varying fastest); results under a fixed seed are unchanged.
noise <- aperm(
  array(runif(length(image), min = 0, max = 0.1), dim = rev(dim(image))),
  perm = rev(seq_along(dim(image)))
)
image <- image + noise

# Intensities pushed above 1 are outside the displayable range; cap them at 0.99.
image[image > 1] <- 0.99
# Draw the noisy image at its native pixel size.
plot(c(0, ncol(image)), c(0, nrow(image)), type = "n", xlab = "", ylab = "")
rasterImage(image, 0, 0, ncol(image), nrow(image))

# Split the noisy image into its three channels.
r <- image[, , 1]
g <- image[, , 2]
b <- image[, , 3]

# Show the three channels side by side. Each channel is transposed and
# reversed along both axes before being handed to image().
old_par <- par(mfrow = c(1, 3))
for (channel in list(r, g, b)) {
  image(t(channel)[ncol(channel):1, nrow(channel):1])
}
# Restore the previous layout so later plots are not squeezed into a 1 x 3 grid.
par(old_par)
library(jpeg)
# Read the second image and draw it at its native pixel size. The plot extent
# is taken from the array itself instead of a hard-coded 512.
imagea <- readJPEG("~/Documents/Dersler/Fall-2018/IE-582/imagegg.jpg")
plot(c(0, ncol(imagea)), c(0, nrow(imagea)), type = "n", xlab = "", ylab = "")
rasterImage(imagea, 0, 0, ncol(imagea), nrow(imagea))
# Extract every 3 x 3 patch whose centre pixel lies in rows/columns 2..510.
# Row m of mt is the patch for the m-th centre in scan order (row by row,
# column index varying fastest); its nine columns hold the patch pixels with
# the row offset (-1..1) as the outer index and the column offset as the inner.
# NOTE(review): mt is allocated with 260100 rows, but 2:510 yields only
# 509 * 509 = 259081 centres, so the remaining rows stay all zero.
centre_idx <- 2:510
filled_rows <- seq_len(length(centre_idx)^2)
mt <- matrix(0, nrow = 260100, ncol = 9)

patch_col <- 0
for (row_offset in -1:1) {
  for (col_offset in -1:1) {
    patch_col <- patch_col + 1
    shifted <- imagea[centre_idx + row_offset, centre_idx + col_offset]
    # t() makes the column index vary fastest, matching the scan order above.
    mt[filled_rows, patch_col] <- as.vector(t(shifted))
  }
}
# Principal components of the nine patch columns, computed on the raw values
# (no centring and no scaling).
mt.pca <- prcomp(
  mt[, 1:9],
  scale. = FALSE,
  center = FALSE
)
summary(mt.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.5876 0.04743 0.0449 0.01681 0.01347 0.01098
## Proportion of Variance 0.9993 0.00034 0.0003 0.00004 0.00003 0.00002
## Cumulative Proportion 0.9993 0.99960 0.9999 0.99995 0.99997 0.99999
## PC7 PC8 PC9
## Standard deviation 0.004988 0.004759 0.003043
## Proportion of Variance 0.000000 0.000000 0.000000
## Cumulative Proportion 1.000000 1.000000 1.000000
PC1 accounts for 0.99 of the total variance, which means that it provides insight into 99% of the data.
# Rebuild the image from a single principal component (1, 2 and 3 in turn).

# Rank-one reconstruction of the patch matrix from one component
# (scores %*% t(loadings)), clamped to [0, 1] because rasterImage() rejects
# intensities outside that range.
component_patches <- function(pca, component) {
  rank_one <- pca$x[, component] %*% t(pca$rotation[, component])
  pmin(pmax(rank_one, 0), 1)
}

# Paste every 3 x 3 patch back around its centre pixel.
# Patch rows are stored in scan order of the centre pixel (row by row, column
# index fastest, centres 2..510 in both directions), so only the first
# length(centres)^2 rows hold patches; any further rows are ignored. Counting
# rows down from the last row of the matrix instead would read every patch
# from the wrong row.
# Where patches overlap, later writes win: the largest row offset, then the
# largest column offset.
paste_patches <- function(patches, centres = 2:510, size = 512) {
  n_centres <- length(centres)
  used_rows <- seq_len(n_centres^2)
  canvas <- matrix(0, nrow = size, ncol = size)
  patch_col <- 0
  for (row_offset in -1:1) {
    for (col_offset in -1:1) {
      patch_col <- patch_col + 1
      # byrow = TRUE undoes the "column index fastest" storage order.
      canvas[centres + row_offset, centres + col_offset] <- matrix(
        patches[used_rows, patch_col],
        nrow = n_centres, ncol = n_centres, byrow = TRUE
      )
    }
  }
  canvas
}

deneme <- component_patches(mt.pca, 1)
recall <- paste_patches(deneme)

deneme2 <- component_patches(mt.pca, 2)
recall2 <- paste_patches(deneme2)

deneme3 <- component_patches(mt.pca, 3)
recall3 <- paste_patches(deneme3)

# Plot the three reconstructions in order.
for (rebuilt in list(recall, recall2, recall3)) {
  plot(c(0, 512), c(0, 512), type = "n", xlab = "", ylab = "")
  rasterImage(rebuilt, 0, 0, 512, 512)
}
# Show the first three eigenvectors (principal directions) as 3 x 3 images.

# Turn nine loadings into a displayable 3 x 3 patch.
# The loadings follow the patch column order (row offset outer, column offset
# inner), so they are laid out row by row. Values are scaled symmetrically
# around mid-grey: 0.5 is a zero loading, 0 and 1 are the largest negative and
# positive magnitudes. This keeps every intensity inside [0, 1], which
# rasterImage() requires.
loadings_to_patch <- function(loadings, side = 3) {
  patch <- matrix(loadings, nrow = side, ncol = side, byrow = TRUE)
  peak <- max(abs(patch))
  if (peak > 0) {
    0.5 + 0.5 * patch / peak
  } else {
    # All loadings are zero: show uniform mid-grey instead of dividing by zero.
    patch + 0.5
  }
}

# Each eigenvector is one COLUMN of the rotation matrix.
tsk4 <- mt.pca$rotation[, 1]
tsk42 <- mt.pca$rotation[, 2]
tsk43 <- mt.pca$rotation[, 3]

# The sign of an eigenvector is arbitrary; the first and third are flipped.
imagev <- loadings_to_patch(-tsk4)
imagev2 <- loadings_to_patch(tsk42)
imagev3 <- loadings_to_patch(-tsk43)

for (patch in list(imagev, imagev2, imagev3)) {
  plot(c(0, 3), c(0, 3), type = "n", xlab = "", ylab = "")
  rasterImage(patch, 0, 0, 3, 3)
}